Preliminaries¶

In [1]:
%matplotlib inline

# IMPORT MODULES
import sys
import os
import time
import warnings
import platform
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt 
import sklearn
from scipy.stats import norm
import seaborn as sns 
import hyperopt
import scipy
import xgboost

# Print Anaconda and Python versions
print(f"Anaconda Version is {sys.version}")
print(f"Python version is {platform.python_version()}")

# Print versions of imported libraries
print(f"Pandas version is: {pd.__version__}")
print(f"Numpy version is: {np.__version__}")
print(f"Matplotlib version is: {matplotlib.__version__}")
print(f"Sklearn version is: {sklearn.__version__}")
print(f"Seaborn version is: {sns.__version__}")
print(f"Hyperopt version is: {hyperopt.__version__}")
print(f"Scipy version is: {scipy.__version__}")
print(f"Xgboost version is: {xgboost.__version__}")

# Set seaborn settings
sns.set(context='paper', palette='winter_r', style='darkgrid', rc={'figure.facecolor': 'gray'}, font_scale=1.5)

# Filter out warnings (deprecations, etc.)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses
Anaconda Version is 3.10.13 (main, Mar 12 2024, 12:16:25) [GCC 12.2.0]
Python version is 3.10.13
Pandas version is: 2.2.1
Numpy version is: 1.26.4
Matplotlib version is: 3.8.3
Sklearn version is: 1.4.1.post1
Seaborn version is: 0.13.2
Hyperopt version is: 0.2.7
Scipy version is: 1.12.0
Xgboost version is: 2.0.3
In [2]:
#Alter ast_note_interactivity kernel option so jupyter displays multiple statements at once
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Import all modules/libraries which will be loaded within this project

In [3]:
import itertools
from time import time
from pprint import pprint
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV,  RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import tree
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_curve, auc, classification_report, confusion_matrix
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import cross_val_score, StratifiedKFold
##Importing hyperopt libraries
from functools import partial
from hyperopt import fmin, hp, tpe, Trials, space_eval
from hyperopt.pyll import scope

Section 2: Data Analysis¶


2.1 Data Exploration¶
In [4]:
# 2.1 Data Exploration

# Get the current working directory
current_directory = os.getcwd()

# Construct the file path
file_path = os.path.join(current_directory, 'data', 'oasis_longitudinal_demographics.csv')

# Load in the data
mri = pd.read_csv(file_path)

# Get a feel for the data - observe first 5 rows
print("First 5 rows of the data:")
print(mri.head(5))

# Print out the number of columns in the dataframe (features)
num_columns = len(mri.columns)
print(f"\nThere are {num_columns} attributes within the dataset.")

# Check the columns of the dataset in prep for Data Exploration
print("\nColumns of the dataset:")
print(mri.columns)
First 5 rows of the data:
  Subject ID         MRI ID        Group  Visit  MR Delay M/F Hand  Age  EDUC  \
0  OAS2_0001  OAS2_0001_MR1  Nondemented      1         0   M    R   87    14   
1  OAS2_0001  OAS2_0001_MR2  Nondemented      2       457   M    R   88    14   
2  OAS2_0002  OAS2_0002_MR1     Demented      1         0   M    R   75    12   
3  OAS2_0002  OAS2_0002_MR2     Demented      2       560   M    R   76    12   
4  OAS2_0002  OAS2_0002_MR3     Demented      3      1895   M    R   80    12   

   SES  MMSE  CDR         eTIV   nWBV    ASF  
0  2.0  27.0  0.0  1986.550000  0.696  0.883  
1  2.0  30.0  0.0  2004.479526  0.681  0.876  
2  NaN  23.0  0.5  1678.290000  0.736  1.046  
3  NaN  28.0  0.5  1737.620000  0.713  1.010  
4  NaN  22.0  0.5  1697.911134  0.701  1.034  

There are 15 attributes within the dataset.

Columns of the dataset:
Index(['Subject ID', 'MRI ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand',
       'Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'],
      dtype='object')
In [5]:
# Check the columns and their type - info prints this info about a dataframe
print("Information about the dataset:")
mri.info()

# Dimensions of the dataset
num_columns = len(mri.columns)
num_rows = len(mri.index)
print(f"\nThe dataset is made up with {num_columns} columns and {num_rows} rows.")
Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  373 non-null    object 
 1   MRI ID      373 non-null    object 
 2   Group       373 non-null    object 
 3   Visit       373 non-null    int64  
 4   MR Delay    373 non-null    int64  
 5   M/F         373 non-null    object 
 6   Hand        373 non-null    object 
 7   Age         373 non-null    int64  
 8   EDUC        373 non-null    int64  
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    float64
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(6), int64(4), object(5)
memory usage: 43.8+ KB

The dataset is made up with 15 columns and 373 rows.
In [6]:
# Let's check the distinct values in certain columns
# Common sense would suggest that Subject ID, MRI ID, MR Delay, eTIV, nWBV, and ASF features would have too many distinct values for the purpose of this test.
# Therefore let's ignore them

num_columns = ['Visit', 'Age', 'EDUC', 'SES', 'MMSE', 'CDR']

for col in num_columns:
    unique_values = mri[col].unique()
    print(f"{len(unique_values)} unique values for {col}:")
    print(unique_values)
    print("\nValue counts:")
    print(mri[col].value_counts(), "\n")

cat_columns = ['Group', 'M/F', 'Hand']
for col in cat_columns:
    unique_values = mri[col].unique()
    print(f"{len(unique_values)} unique values for {col}:")
    print(unique_values)
    print("\nValue counts:")
    print(mri[col].value_counts(), "\n")
5 unique values for Visit:
[1 2 3 4 5]

Value counts:
Visit
1    150
2    144
3     58
4     15
5      6
Name: count, dtype: int64 

39 unique values for Age:
[87 88 75 76 80 90 83 85 71 73 93 95 68 69 66 78 81 82 77 86 92 84 72 61
 64 74 60 62 91 79 89 70 94 97 65 67 63 96 98]

Value counts:
Age
73    26
75    22
78    21
80    20
81    18
71    18
82    17
76    16
77    16
68    14
84    13
69    13
83    12
70    12
74    12
72    11
79    11
66    10
88    10
86    10
85     9
89     7
65     6
67     6
87     6
90     5
62     4
91     4
61     4
92     4
64     3
93     3
63     3
60     2
95     1
94     1
97     1
96     1
98     1
Name: count, dtype: int64 

12 unique values for EDUC:
[14 12 18 16  8 20 13  6 17 15 23 11]

Value counts:
EDUC
12    103
16     81
18     64
14     33
13     27
15     17
20     13
11     11
8       9
17      9
6       3
23      3
Name: count, dtype: int64 

6 unique values for SES:
[ 2. nan  3.  4.  1.  5.]

Value counts:
SES
2.0    103
1.0     88
3.0     82
4.0     74
5.0      7
Name: count, dtype: int64 

19 unique values for MMSE:
[27. 30. 23. 28. 22. 29. 24. 21. 16. 25. 26. 15. 20. 19.  7.  4. 17. 18.
 nan]

Value counts:
MMSE
30.0    114
29.0     91
28.0     45
27.0     32
26.0     20
25.0     12
21.0     11
23.0     11
22.0      7
20.0      7
17.0      5
24.0      4
16.0      3
19.0      3
15.0      2
18.0      2
7.0       1
4.0       1
Name: count, dtype: int64 

4 unique values for CDR:
[0.  0.5 1.  2. ]

Value counts:
CDR
0.0    206
0.5    123
1.0     41
2.0      3
Name: count, dtype: int64 

3 unique values for Group:
['Nondemented' 'Demented' 'Converted']

Value counts:
Group
Nondemented    190
Demented       146
Converted       37
Name: count, dtype: int64 

2 unique values for M/F:
['M' 'F']

Value counts:
M/F
F    213
M    160
Name: count, dtype: int64 

1 unique values for Hand:
['R']

Value counts:
Hand
R    373
Name: count, dtype: int64 

In [7]:
#Get a summary of the numerical attributes within our dataset - let's us see if we need to normalize/scale at a later point
for col in num_columns:
    mri[col].describe()
Out[7]:
count    373.000000
mean       1.882038
std        0.922843
min        1.000000
25%        1.000000
50%        2.000000
75%        2.000000
max        5.000000
Name: Visit, dtype: float64
Out[7]:
count    373.000000
mean      77.013405
std        7.640957
min       60.000000
25%       71.000000
50%       77.000000
75%       82.000000
max       98.000000
Name: Age, dtype: float64
Out[7]:
count    373.000000
mean      14.597855
std        2.876339
min        6.000000
25%       12.000000
50%       15.000000
75%       16.000000
max       23.000000
Name: EDUC, dtype: float64
Out[7]:
count    354.000000
mean       2.460452
std        1.134005
min        1.000000
25%        2.000000
50%        2.000000
75%        3.000000
max        5.000000
Name: SES, dtype: float64
Out[7]:
count    371.000000
mean      27.342318
std        3.683244
min        4.000000
25%       27.000000
50%       29.000000
75%       30.000000
max       30.000000
Name: MMSE, dtype: float64
Out[7]:
count    373.000000
mean       0.290885
std        0.374557
min        0.000000
25%        0.000000
50%        0.000000
75%        0.500000
max        2.000000
Name: CDR, dtype: float64
2.2 Exploratory Data Analysis¶
In [8]:
# Plotting histograms side by side for pairs of variables

# Define pairs of variables
pairs = [('Age', 'CDR'), ('SES', 'EDUC'), ('MMSE', 'M/F')]

# Plot histograms for each pair of variables
for pair in pairs:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    for i, col in enumerate(pair):
        sns.histplot(mri[col], ax=axes[i], kde=False)
        axes[i].set_title(f'Histogram of {col}')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')
    plt.tight_layout()
# Adding ; to last statement to get rid of <matplotlib...> <seaborn...> text
plt.show();
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

A few things can be concluded from these graphs about the dataset:

  • It seems that the age of the majority of Subject IDs falls between 70 - 85 with the most frequent age being 73 years old (more than 25 participants in this study are of this age). Note this could be skewed as we have not yet discounted Subject IDs that have had more than one session.
  • More than half of the datasets' population received a CDR rating of 0 (>200 Subject IDs) while less than 10 subjects were diagnosed with a 2.0 CDR score.
  • The most frequent Socio-Economic-status score in this study was 2.0 with over > 100 subject IDs (103), followed by > 85 subject IDs graded as 1.0 on the SES scale.
  • Most of the individuals in this study undertook a significant amount of years in Education with over 2/3s of the study having at least studied 12+ years of Education.
  • The majority of Subject IDs scored highly in their MMSE examinations (> 50% scoring 29 or above).
  • The percentage ratio between females and males was 57:43.

NOTE:

Although i will be replacing the 'Converted' values in the group attribute, for the purpose of visual analysis, I am dropping the rows for 'Converted' patients so i can directly compare 'demented' and 'non-demented' subjects. To accomodate this, I am taking a copy of the 'mri' dataframe to ensure we dont lose values.

Let's create a function called plot_facetgrid which will generate FacetGrid plots based on specified variables, using predefined colors, titles, and x-axis limits. It utilizes Seaborn's FacetGrid and kdeplot functions for visualization:

In [9]:
# Create a copy of the dataframe so we can remove the 'converted' value from the 'Group' attribute
mri_copy = mri.copy()

mri_copy =mri_copy[mri_copy.Group != "Converted"]

def plot_facetgrid(data, var):
    """
    Function to create a FacetGrid chart based on the variable var.
    
    Parameters:
        data (DataFrame): The DataFrame containing the data.
        var (str): The variable to plot.
    """
    csfont_options = { 'fontname': 'monospace',
                   'weight': 'bold',
                   'color':'b',
                   'size': 'large'
                  }
    
    plot_colors_lookup = {
        'MMSE': {'color': ['r', 'k'], 'title': 'Dementia Variation against MMSE scores', 'xlim': (10, data['MMSE'].max())},
        'SES': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Socio-Economic-Status', 'xlim': (0, data['SES'].max())},
        'nWBV': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Normalized Whole Brain Volume', 'xlim': (0, data['nWBV'].max())},
        'eTIV': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Estimated Total Intracranial volume', 'xlim': (0, data['eTIV'].max())},
        'EDUC': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Years of Education', 'xlim': (3, data['EDUC'].max())},
        'Age': {'color': ['r', 'k'], 'title': 'Dementia Variation against Age', 'xlim': (10, data['Age'].max())}
    }
    
    # Lookup plot colors, title, and xlim based on the variable var
    plot_colors = plot_colors_lookup[var]['color']
    title = plot_colors_lookup[var]['title']
    xlim = plot_colors_lookup[var]['xlim']
    
    # Create FacetGrid
    g = sns.FacetGrid(data, hue='Group', hue_kws={'color': plot_colors}, height=8)
    
    # Plot KDE plot
    g.map(sns.kdeplot, var, shade=True)
    
    # Set x-axis limit
    g.set(xlim=xlim)
    
    # Add legend
    plt.legend(loc='best')
    
    # Set title
    plt.title(title, **csfont_options)
    
    # Show plot
    plt.show()
In [10]:
plot_facetgrid(mri_copy, 'MMSE')
No description has been provided for this image

From the above graph, one can see the Dementia distribution as a function of MMSE score. For high MMSE scores, in approximately 70% of cases, the subject IDs are generally considered as non-demented . Finally, as expected, for low MMSE scores (<22) all subjects receive a 'demented' diagnosis.

In [11]:
plot_facetgrid(mri_copy, 'SES')
No description has been provided for this image

Interestingly, the distribution of Dementia across the study is greatest for subjects who have receive a score in the socio-economic-status scale of 3 or more. This possibly suggests that the more afluent an individual is, the greater the chance of Dementia. Subjects who generally score 3 or less are considered 'Non-Demented' (in >50% of cases for a score of 2) or 'Converted' (in ~60% of cases where subjects score 1 in the SES scale)

In [12]:
plot_facetgrid(mri_copy, 'nWBV')
No description has been provided for this image

As one would expect, subjects who exhibited a greater percentage of normalized whole brain volume, had a greater chance of receiving a 'Non-demented' diagnosis. The lower the nWBV value, the greater the chance a patient was classified as 'demented' or 'converted'. In recent years, brain shrinkage has become a potential important marker for early changes in the brain tissue where such symptoms have been associated as early markers for ailments of Alzheimer's [https://www.webmd.com/alzheimers/news/20110413/brain-shrinkage-may-help-predict-alzheimers].

In [13]:
plot_facetgrid(mri_copy, 'eTIV')
No description has been provided for this image

Interestingly, there didn't seem to be much difference between the variation of Dementia with ETIV i.e. the estimated total intracranial volume didn't vary much over time and therefore ETIV was not significantly different between any of classification groups. As referenced in the Paper "Inracranial Volume and Alzheimer Disease: Evidence agaisnt the cerebal Reserve Hypothesis" by Jenkins et al, the Mean TIV did not differ significantly between subjects and that the only significant predictor of TIV was sex. Thus it was concluded that their findings do not suggest that a larger brain provides better 'protection' against AD [https://www.ncbi.nlm.nih.gov/pubmed/10681081]. This explains the visual cues seen in the plot above .i.e 'Demented' and 'nondemented' subjects have relatively the same ETIV values.

In [14]:
plot_facetgrid(mri_copy, 'EDUC')
No description has been provided for this image

It seems from quickly glancing at the above, Dementia is more prevelant in subjects who had fewer years in Education. For e.g. any subject who had less than 13 years of Education had a slightly increased possibility of developing Dementia where as the chances of having Dementia-like symptoms (on first visit) decreases quite sharply for subjects who have had 17+ years in Education.

Finally, Dementia is more prevelant in subjects who fall within the 65 - 85 year group. Before 65, one could hypothesise that the subject is too young for the full blown effects of Dementia to manifest. Similarly, after 85, given that a patient could have been living with the disease for many years,certain symptoms could have had a profound effect on an individual's life. This possibly explains why the dementia (not converted) diagnosis is small after the years of 85. Obviously, Dementia forms in people at different stages, as shown by the converted plot where it gets larger in magnitude as ages increase (up to ~85 years old). This would suggest that a subject's condition has been updated from a 'non-demented' to 'demented' status in that timeframe. For the purpose of visual analysis and for reasons of brevity, I'm only discussing the direct comparisons between the 'demented' and 'non-demented' plots.

In [15]:
plot_facetgrid(mri_copy, 'Age')
No description has been provided for this image

If you want to see the 'Converted' group plotted against both 'demented' & 'non-demented' groups:

In [16]:
# Define the target variable and features
targ = 'Group'
cols = ['MMSE', 'SES', 'nWBV', 'eTIV', 'EDUC', 'Age']

# Define font options for the title
csfont_options = {'fontname': 'monospace', 'weight': 'bold', 'color': 'b', 'size': 'large'}

# Set min x-axis values for each feature
xlimits = [12, 0, 0.55, 840, 3, 50]

# Define plot colors for different groups
plot_colors = {'Converted': 'r', 'Demented': 'k', 'Non-Demented': 'b'}

# Iterate over the target variable and features
for feature, xlimit in zip(cols, xlimits):
    g = sns.FacetGrid(mri, hue=targ, height=8)
    g.map(sns.kdeplot, feature, shade=True)
    g.set(xlim=(10, mri[feature].max()))
    plt.legend(loc='best')
    plt.title('Dementia Variation against {}'.format(feature), **csfont_options)
    plt.xlim(xlimit)

plt.show();
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Out of curiousity, plot the frequency gender for Demented/Non-Demented subjects:

In [17]:
g = sns.catplot(x='M/F', 
                data=mri_copy, hue='Group', 
                kind='count', palette="muted", 
                legend=False, height=6)

g.despine(left=True)
g.set_ylabels("Participation count")
plt.legend(loc='best')
plt.show();
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No description has been provided for this image
2.2.2 Correlation¶
In [18]:
def plot_corr_heatmap(data, method, heading):
    """
    Plot correlation heatmap for the given data.

    Parameters:
        data (DataFrame): The DataFrame containing the data.
        method (str): The method used for computing correlations.
        heading (str): The title for the heatmap.
    """
    # Copy mri dataset and preprocess
    mri_copy = data.drop(['Subject ID', 'M/F', 'MRI ID', 'Hand'], axis=1)
    mri_copy = pd.get_dummies(mri_copy, drop_first=True)

    # Compute correlation matrix
    corr = mri_copy.corr(method=method)

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(12, 8))

    # Draw the heatmap
    sns.heatmap(corr, annot=True, linewidths=.5, fmt='.2f', ax=ax)

    # Set title
    ax.set_title(heading)

    # Show plot
    plt.show()

    return corr

plot_corr_heatmap(mri, 'pearson', 'Correlation Heat map')
No description has been provided for this image
Out[18]:
Visit MR Delay Age EDUC SES MMSE CDR eTIV nWBV ASF Group_Demented Group_Nondemented
Visit 1.000000 0.920009 0.183213 0.024615 -0.051622 -0.029078 0.002325 0.117423 -0.126682 -0.120399 -0.129800 0.095507
MR Delay 0.920009 1.000000 0.205357 0.051630 -0.030813 0.065844 -0.062915 0.119641 -0.105586 -0.123545 -0.180156 0.120638
Age 0.183213 0.205357 1.000000 -0.027886 -0.046857 0.055612 -0.026257 0.042401 -0.518359 -0.035067 -0.079153 0.005941
EDUC 0.024615 0.051630 -0.027886 1.000000 -0.722647 0.194884 -0.153121 0.257042 -0.012200 -0.241752 -0.258708 0.193060
SES -0.051622 -0.030813 -0.046857 -0.722647 1.000000 -0.149219 0.076160 -0.261582 0.090095 0.255576 0.205556 -0.062463
MMSE -0.029078 0.065844 0.055612 0.194884 -0.149219 1.000000 -0.686519 -0.032088 0.341912 0.040052 -0.612448 0.524775
CDR 0.002325 -0.062915 -0.026257 -0.153121 0.076160 -0.686519 1.000000 0.022863 -0.344819 -0.029340 0.815473 -0.778049
eTIV 0.117423 0.119641 0.042401 0.257042 -0.261582 -0.032088 0.022863 1.000000 -0.210180 -0.988905 -0.010365 0.042579
nWBV -0.126682 -0.105586 -0.518359 -0.012200 0.090095 0.341912 -0.344819 -0.210180 1.000000 0.213476 -0.286903 0.311346
ASF -0.120399 -0.123545 -0.035067 -0.241752 0.255576 0.040052 -0.029340 -0.988905 0.213476 1.000000 0.008312 -0.032495
Group_Demented -0.129800 -0.180156 -0.079153 -0.258708 0.205556 -0.612448 0.815473 -0.010365 -0.286903 0.008312 1.000000 -0.817174
Group_Nondemented 0.095507 0.120638 0.005941 0.193060 -0.062463 0.524775 -0.778049 0.042579 0.311346 -0.032495 -0.817174 1.000000

Observing the correlation coefficient (pearsons r) between each pair of attributes, I can make several assumptions:

  • CDR has a strong positive correlation with both Group_Demented and Group_Nondemented. This is expected given that the CDR scale is used to gauge whether a patient has symptoms of Dementia. Given that this feature is strongly correlated with the target attribute, it would be advisable to drop this column during feature selection.
  • Age and nWBV have quite a strong negative correlation.
  • EDUC has quite strong correlations with MMSE, eTIV and a v.strong negative correlation with SES
  • MMSE and CDR share a strong -ive correlation , quite strong correlation with nWBV.
  • Most of the attributes share a correlation that is close to 0 thus we assume that there is little to no linear correlation

Note, these **correlations are only linear and as a result we arent measuring non-linear relationships that could exist within the dataset (if attribute x is close to 0, y goes up) **

2.2.3 Scatter Matrix¶
In [19]:
scatter_matrix(mri[mri.columns], figsize=(12,8));
No description has been provided for this image

Section 3: Data preprocessing¶


3.1 Categorical Attribute replacement¶
In [20]:
# Display unique values before and after replacement
for col in ['Group', 'M/F']:
    print(f"Before replacement {col} has the following values: {mri[col].unique()}")

# Create a mapping dictionary for replacement
mapping = {'F': 0, 'M': 1, 'Nondemented': 0, 'Demented': 1, 'Converted': 1}

# Replace categorical values with numerical values
mri.replace({'Group': mapping, 'M/F': mapping}, inplace=True)

# Display unique values after replacement
for col in ['Group', 'M/F']:
    print(f"After replacement {col} has the following numerical values: {mri[col].unique()}")
Before replacement Group has the following values: ['Nondemented' 'Demented' 'Converted']
Before replacement M/F has the following values: ['M' 'F']
After replacement Group has the following numerical values: [0 1]
After replacement M/F has the following numerical values: [1 0]
In [21]:
## Using LabelEncoder from scikit learn to encode categorical features to numerical.
obj_mri = mri.select_dtypes(include=['object'])

# Converting categorical Data 
for col in obj_mri.columns:
    le = LabelEncoder()
    le.fit(mri[col])
    list(le.classes_)
    mri[col] = le.transform(mri[col])
Out[21]:
LabelEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LabelEncoder()
Out[21]:
['OAS2_0001',
 'OAS2_0002',
 'OAS2_0004',
 'OAS2_0005',
 'OAS2_0007',
 'OAS2_0008',
 'OAS2_0009',
 'OAS2_0010',
 'OAS2_0012',
 'OAS2_0013',
 'OAS2_0014',
 'OAS2_0016',
 'OAS2_0017',
 'OAS2_0018',
 'OAS2_0020',
 'OAS2_0021',
 'OAS2_0022',
 'OAS2_0023',
 'OAS2_0026',
 'OAS2_0027',
 'OAS2_0028',
 'OAS2_0029',
 'OAS2_0030',
 'OAS2_0031',
 'OAS2_0032',
 'OAS2_0034',
 'OAS2_0035',
 'OAS2_0036',
 'OAS2_0037',
 'OAS2_0039',
 'OAS2_0040',
 'OAS2_0041',
 'OAS2_0042',
 'OAS2_0043',
 'OAS2_0044',
 'OAS2_0045',
 'OAS2_0046',
 'OAS2_0047',
 'OAS2_0048',
 'OAS2_0049',
 'OAS2_0050',
 'OAS2_0051',
 'OAS2_0052',
 'OAS2_0053',
 'OAS2_0054',
 'OAS2_0055',
 'OAS2_0056',
 'OAS2_0057',
 'OAS2_0058',
 'OAS2_0060',
 'OAS2_0061',
 'OAS2_0062',
 'OAS2_0063',
 'OAS2_0064',
 'OAS2_0066',
 'OAS2_0067',
 'OAS2_0068',
 'OAS2_0069',
 'OAS2_0070',
 'OAS2_0071',
 'OAS2_0073',
 'OAS2_0075',
 'OAS2_0076',
 'OAS2_0077',
 'OAS2_0078',
 'OAS2_0079',
 'OAS2_0080',
 'OAS2_0081',
 'OAS2_0085',
 'OAS2_0086',
 'OAS2_0087',
 'OAS2_0088',
 'OAS2_0089',
 'OAS2_0090',
 'OAS2_0091',
 'OAS2_0092',
 'OAS2_0094',
 'OAS2_0095',
 'OAS2_0096',
 'OAS2_0097',
 'OAS2_0098',
 'OAS2_0099',
 'OAS2_0100',
 'OAS2_0101',
 'OAS2_0102',
 'OAS2_0103',
 'OAS2_0104',
 'OAS2_0105',
 'OAS2_0106',
 'OAS2_0108',
 'OAS2_0109',
 'OAS2_0111',
 'OAS2_0112',
 'OAS2_0113',
 'OAS2_0114',
 'OAS2_0116',
 'OAS2_0117',
 'OAS2_0118',
 'OAS2_0119',
 'OAS2_0120',
 'OAS2_0121',
 'OAS2_0122',
 'OAS2_0124',
 'OAS2_0126',
 'OAS2_0127',
 'OAS2_0128',
 'OAS2_0129',
 'OAS2_0131',
 'OAS2_0133',
 'OAS2_0134',
 'OAS2_0135',
 'OAS2_0137',
 'OAS2_0138',
 'OAS2_0139',
 'OAS2_0140',
 'OAS2_0141',
 'OAS2_0142',
 'OAS2_0143',
 'OAS2_0144',
 'OAS2_0145',
 'OAS2_0146',
 'OAS2_0147',
 'OAS2_0149',
 'OAS2_0150',
 'OAS2_0152',
 'OAS2_0154',
 'OAS2_0156',
 'OAS2_0157',
 'OAS2_0158',
 'OAS2_0159',
 'OAS2_0160',
 'OAS2_0161',
 'OAS2_0162',
 'OAS2_0164',
 'OAS2_0165',
 'OAS2_0169',
 'OAS2_0171',
 'OAS2_0172',
 'OAS2_0174',
 'OAS2_0175',
 'OAS2_0176',
 'OAS2_0177',
 'OAS2_0178',
 'OAS2_0179',
 'OAS2_0181',
 'OAS2_0182',
 'OAS2_0183',
 'OAS2_0184',
 'OAS2_0185',
 'OAS2_0186']
Out[21]:
LabelEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LabelEncoder()
Out[21]:
['OAS2_0001_MR1',
 'OAS2_0001_MR2',
 'OAS2_0002_MR1',
 'OAS2_0002_MR2',
 'OAS2_0002_MR3',
 'OAS2_0004_MR1',
 'OAS2_0004_MR2',
 'OAS2_0005_MR1',
 'OAS2_0005_MR2',
 'OAS2_0005_MR3',
 'OAS2_0007_MR1',
 'OAS2_0007_MR3',
 'OAS2_0007_MR4',
 'OAS2_0008_MR1',
 'OAS2_0008_MR2',
 'OAS2_0009_MR1',
 'OAS2_0009_MR2',
 'OAS2_0010_MR1',
 'OAS2_0010_MR2',
 'OAS2_0012_MR1',
 'OAS2_0012_MR2',
 'OAS2_0012_MR3',
 'OAS2_0013_MR1',
 'OAS2_0013_MR2',
 'OAS2_0013_MR3',
 'OAS2_0014_MR1',
 'OAS2_0014_MR2',
 'OAS2_0016_MR1',
 'OAS2_0016_MR2',
 'OAS2_0017_MR1',
 'OAS2_0017_MR3',
 'OAS2_0017_MR4',
 'OAS2_0017_MR5',
 'OAS2_0018_MR1',
 'OAS2_0018_MR3',
 'OAS2_0018_MR4',
 'OAS2_0020_MR1',
 'OAS2_0020_MR2',
 'OAS2_0020_MR3',
 'OAS2_0021_MR1',
 'OAS2_0021_MR2',
 'OAS2_0022_MR1',
 'OAS2_0022_MR2',
 'OAS2_0023_MR1',
 'OAS2_0023_MR2',
 'OAS2_0026_MR1',
 'OAS2_0026_MR2',
 'OAS2_0027_MR1',
 'OAS2_0027_MR2',
 'OAS2_0027_MR3',
 'OAS2_0027_MR4',
 'OAS2_0028_MR1',
 'OAS2_0028_MR2',
 'OAS2_0029_MR1',
 'OAS2_0029_MR2',
 'OAS2_0030_MR1',
 'OAS2_0030_MR2',
 'OAS2_0031_MR1',
 'OAS2_0031_MR2',
 'OAS2_0031_MR3',
 'OAS2_0032_MR1',
 'OAS2_0032_MR2',
 'OAS2_0034_MR1',
 'OAS2_0034_MR2',
 'OAS2_0034_MR3',
 'OAS2_0034_MR4',
 'OAS2_0035_MR1',
 'OAS2_0035_MR2',
 'OAS2_0036_MR1',
 'OAS2_0036_MR3',
 'OAS2_0036_MR4',
 'OAS2_0036_MR5',
 'OAS2_0037_MR1',
 'OAS2_0037_MR2',
 'OAS2_0037_MR3',
 'OAS2_0037_MR4',
 'OAS2_0039_MR1',
 'OAS2_0039_MR2',
 'OAS2_0040_MR1',
 'OAS2_0040_MR2',
 'OAS2_0040_MR3',
 'OAS2_0041_MR1',
 'OAS2_0041_MR2',
 'OAS2_0041_MR3',
 'OAS2_0042_MR1',
 'OAS2_0042_MR2',
 'OAS2_0043_MR1',
 'OAS2_0043_MR2',
 'OAS2_0044_MR1',
 'OAS2_0044_MR2',
 'OAS2_0044_MR3',
 'OAS2_0045_MR1',
 'OAS2_0045_MR2',
 'OAS2_0046_MR1',
 'OAS2_0046_MR2',
 'OAS2_0047_MR1',
 'OAS2_0047_MR2',
 'OAS2_0048_MR1',
 'OAS2_0048_MR2',
 'OAS2_0048_MR3',
 'OAS2_0048_MR4',
 'OAS2_0048_MR5',
 'OAS2_0049_MR1',
 'OAS2_0049_MR2',
 'OAS2_0049_MR3',
 'OAS2_0050_MR1',
 'OAS2_0050_MR2',
 'OAS2_0051_MR1',
 'OAS2_0051_MR2',
 'OAS2_0051_MR3',
 'OAS2_0052_MR1',
 'OAS2_0052_MR2',
 'OAS2_0053_MR1',
 'OAS2_0053_MR2',
 'OAS2_0054_MR1',
 'OAS2_0054_MR2',
 'OAS2_0055_MR1',
 'OAS2_0055_MR2',
 'OAS2_0056_MR1',
 'OAS2_0056_MR2',
 'OAS2_0057_MR1',
 'OAS2_0057_MR2',
 'OAS2_0057_MR3',
 'OAS2_0058_MR1',
 'OAS2_0058_MR2',
 'OAS2_0058_MR3',
 'OAS2_0060_MR1',
 'OAS2_0060_MR2',
 'OAS2_0061_MR1',
 'OAS2_0061_MR2',
 'OAS2_0061_MR3',
 'OAS2_0062_MR1',
 'OAS2_0062_MR2',
 'OAS2_0062_MR3',
 'OAS2_0063_MR1',
 'OAS2_0063_MR2',
 'OAS2_0064_MR1',
 'OAS2_0064_MR2',
 'OAS2_0064_MR3',
 'OAS2_0066_MR1',
 'OAS2_0066_MR2',
 'OAS2_0067_MR1',
 'OAS2_0067_MR2',
 'OAS2_0067_MR3',
 'OAS2_0067_MR4',
 'OAS2_0068_MR1',
 'OAS2_0068_MR2',
 'OAS2_0069_MR1',
 'OAS2_0069_MR2',
 'OAS2_0070_MR1',
 'OAS2_0070_MR2',
 'OAS2_0070_MR3',
 'OAS2_0070_MR4',
 'OAS2_0070_MR5',
 'OAS2_0071_MR1',
 'OAS2_0071_MR2',
 'OAS2_0073_MR1',
 'OAS2_0073_MR2',
 'OAS2_0073_MR3',
 'OAS2_0073_MR4',
 'OAS2_0073_MR5',
 'OAS2_0075_MR1',
 'OAS2_0075_MR2',
 'OAS2_0076_MR1',
 'OAS2_0076_MR2',
 'OAS2_0076_MR3',
 'OAS2_0077_MR1',
 'OAS2_0077_MR2',
 'OAS2_0078_MR1',
 'OAS2_0078_MR2',
 'OAS2_0078_MR3',
 'OAS2_0079_MR1',
 'OAS2_0079_MR2',
 'OAS2_0079_MR3',
 'OAS2_0080_MR1',
 'OAS2_0080_MR2',
 'OAS2_0080_MR3',
 'OAS2_0081_MR1',
 'OAS2_0081_MR2',
 'OAS2_0085_MR1',
 'OAS2_0085_MR2',
 'OAS2_0086_MR1',
 'OAS2_0086_MR2',
 'OAS2_0087_MR1',
 'OAS2_0087_MR2',
 'OAS2_0088_MR1',
 'OAS2_0088_MR2',
 'OAS2_0089_MR1',
 'OAS2_0089_MR3',
 'OAS2_0090_MR1',
 'OAS2_0090_MR2',
 'OAS2_0090_MR3',
 'OAS2_0091_MR1',
 'OAS2_0091_MR2',
 'OAS2_0092_MR1',
 'OAS2_0092_MR2',
 'OAS2_0094_MR1',
 'OAS2_0094_MR2',
 'OAS2_0095_MR1',
 'OAS2_0095_MR2',
 'OAS2_0095_MR3',
 'OAS2_0096_MR1',
 'OAS2_0096_MR2',
 'OAS2_0097_MR1',
 'OAS2_0097_MR2',
 'OAS2_0098_MR1',
 'OAS2_0098_MR2',
 'OAS2_0099_MR1',
 'OAS2_0099_MR2',
 'OAS2_0100_MR1',
 'OAS2_0100_MR2',
 'OAS2_0100_MR3',
 'OAS2_0101_MR1',
 'OAS2_0101_MR2',
 'OAS2_0101_MR3',
 'OAS2_0102_MR1',
 'OAS2_0102_MR2',
 'OAS2_0102_MR3',
 'OAS2_0103_MR1',
 'OAS2_0103_MR2',
 'OAS2_0103_MR3',
 'OAS2_0104_MR1',
 'OAS2_0104_MR2',
 'OAS2_0105_MR1',
 'OAS2_0105_MR2',
 'OAS2_0106_MR1',
 'OAS2_0106_MR2',
 'OAS2_0108_MR1',
 'OAS2_0108_MR2',
 'OAS2_0109_MR1',
 'OAS2_0109_MR2',
 'OAS2_0111_MR1',
 'OAS2_0111_MR2',
 'OAS2_0112_MR1',
 'OAS2_0112_MR2',
 'OAS2_0113_MR1',
 'OAS2_0113_MR2',
 'OAS2_0114_MR1',
 'OAS2_0114_MR2',
 'OAS2_0116_MR1',
 'OAS2_0116_MR2',
 'OAS2_0117_MR1',
 'OAS2_0117_MR2',
 'OAS2_0117_MR3',
 'OAS2_0117_MR4',
 'OAS2_0118_MR1',
 'OAS2_0118_MR2',
 'OAS2_0119_MR1',
 'OAS2_0119_MR2',
 'OAS2_0119_MR3',
 'OAS2_0120_MR1',
 'OAS2_0120_MR2',
 'OAS2_0121_MR1',
 'OAS2_0121_MR2',
 'OAS2_0122_MR1',
 'OAS2_0122_MR2',
 'OAS2_0124_MR1',
 'OAS2_0124_MR2',
 'OAS2_0126_MR1',
 'OAS2_0126_MR2',
 'OAS2_0126_MR3',
 'OAS2_0127_MR1',
 'OAS2_0127_MR2',
 'OAS2_0127_MR3',
 'OAS2_0127_MR4',
 'OAS2_0127_MR5',
 'OAS2_0128_MR1',
 'OAS2_0128_MR2',
 'OAS2_0129_MR1',
 'OAS2_0129_MR2',
 'OAS2_0129_MR3',
 'OAS2_0131_MR1',
 'OAS2_0131_MR2',
 'OAS2_0133_MR1',
 'OAS2_0133_MR3',
 'OAS2_0134_MR1',
 'OAS2_0134_MR2',
 'OAS2_0135_MR1',
 'OAS2_0135_MR2',
 'OAS2_0137_MR1',
 'OAS2_0137_MR2',
 'OAS2_0138_MR1',
 'OAS2_0138_MR2',
 'OAS2_0139_MR1',
 'OAS2_0139_MR2',
 'OAS2_0140_MR1',
 'OAS2_0140_MR2',
 'OAS2_0140_MR3',
 'OAS2_0141_MR1',
 'OAS2_0141_MR2',
 'OAS2_0142_MR1',
 'OAS2_0142_MR2',
 'OAS2_0143_MR1',
 'OAS2_0143_MR2',
 'OAS2_0143_MR3',
 'OAS2_0144_MR1',
 'OAS2_0144_MR2',
 'OAS2_0145_MR1',
 'OAS2_0145_MR2',
 'OAS2_0146_MR1',
 'OAS2_0146_MR2',
 'OAS2_0147_MR1',
 'OAS2_0147_MR2',
 'OAS2_0147_MR3',
 'OAS2_0147_MR4',
 'OAS2_0149_MR1',
 'OAS2_0149_MR2',
 'OAS2_0150_MR1',
 'OAS2_0150_MR2',
 'OAS2_0152_MR1',
 'OAS2_0152_MR2',
 'OAS2_0152_MR3',
 'OAS2_0154_MR1',
 'OAS2_0154_MR2',
 'OAS2_0156_MR1',
 'OAS2_0156_MR2',
 'OAS2_0157_MR1',
 'OAS2_0157_MR2',
 'OAS2_0158_MR1',
 'OAS2_0158_MR2',
 'OAS2_0159_MR1',
 'OAS2_0159_MR2',
 'OAS2_0160_MR1',
 'OAS2_0160_MR2',
 'OAS2_0161_MR1',
 'OAS2_0161_MR2',
 'OAS2_0161_MR3',
 'OAS2_0162_MR1',
 'OAS2_0162_MR2',
 'OAS2_0164_MR1',
 'OAS2_0164_MR2',
 'OAS2_0165_MR1',
 'OAS2_0165_MR2',
 'OAS2_0169_MR1',
 'OAS2_0169_MR2',
 'OAS2_0171_MR1',
 'OAS2_0171_MR2',
 'OAS2_0171_MR3',
 'OAS2_0172_MR1',
 'OAS2_0172_MR2',
 'OAS2_0174_MR1',
 'OAS2_0174_MR2',
 'OAS2_0174_MR3',
 'OAS2_0175_MR1',
 'OAS2_0175_MR2',
 'OAS2_0175_MR3',
 'OAS2_0176_MR1',
 'OAS2_0176_MR2',
 'OAS2_0176_MR3',
 'OAS2_0177_MR1',
 'OAS2_0177_MR2',
 'OAS2_0178_MR1',
 'OAS2_0178_MR2',
 'OAS2_0178_MR3',
 'OAS2_0179_MR1',
 'OAS2_0179_MR2',
 'OAS2_0181_MR1',
 'OAS2_0181_MR2',
 'OAS2_0181_MR3',
 'OAS2_0182_MR1',
 'OAS2_0182_MR2',
 'OAS2_0183_MR1',
 'OAS2_0183_MR2',
 'OAS2_0183_MR3',
 'OAS2_0183_MR4',
 'OAS2_0184_MR1',
 'OAS2_0184_MR2',
 'OAS2_0185_MR1',
 'OAS2_0185_MR2',
 'OAS2_0185_MR3',
 'OAS2_0186_MR1',
 'OAS2_0186_MR2',
 'OAS2_0186_MR3']
Out[21]:
LabelEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LabelEncoder()
Out[21]:
['R']
In [22]:
# Check that all columns are now numerical
mri.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  373 non-null    int64  
 1   MRI ID      373 non-null    int64  
 2   Group       373 non-null    int64  
 3   Visit       373 non-null    int64  
 4   MR Delay    373 non-null    int64  
 5   M/F         373 non-null    int64  
 6   Hand        373 non-null    int64  
 7   Age         373 non-null    int64  
 8   EDUC        373 non-null    int64  
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    float64
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(6), int64(9)
memory usage: 43.8 KB
3.1.2 Outlier detection¶
In [23]:
def outlier_detection(cols):
    """
    Detect outliers in a list of values using z-score method.
    
    Parameters:
        cols (list): A list of numerical values.
    
    Returns:
        outliers (tuple): A tuple containing the indices of outliers.
    """
    mean_col = np.mean(cols)
    std_col = np.std(cols)
    threshold = 3
    z_scores = [(col - mean_col) / std_col for col in cols]
    outliers = np.where(np.abs(z_scores) > threshold)
    return outliers
In [24]:
cols = ["EDUC", "SES", "MMSE", "CDR", "eTIV", "Visit", "eTIV", "nWBV", "ASF"]

for col in cols:
    outliers_indices = outlier_detection(mri[col])
    if len(outliers_indices[0]) > 0:
        print(f"Outliers for {col} are given below:")
        for i in outliers_indices[0]:
            outlier_value = mri[col].iloc[i]
            print(f"Index: {i}, Value: {outlier_value}")
        print()
Outliers for MMSE are given below:
Index: 26, Value: 16.0
Index: 89, Value: 15.0
Index: 100, Value: 7.0
Index: 101, Value: 4.0
Index: 172, Value: 16.0
Index: 173, Value: 16.0
Index: 251, Value: 15.0

Outliers for CDR are given below:
Index: 184, Value: 2.0
Index: 251, Value: 2.0
Index: 330, Value: 2.0

Outliers for Visit are given below:
Index: 32, Value: 5
Index: 71, Value: 5
Index: 101, Value: 5
Index: 153, Value: 5
Index: 160, Value: 5
Index: 265, Value: 5

The outliers for MMSE, CDR and Visit have actual 'meaning' ... i.e. MMSE scores of 16,15,7,4 etc are used to determine the severity of Dementia in a subject (same applies to CDR). Thus, I believe it wouldn't be suitable to drop these outliers from the dataset

3.1.3 Imputation - dealing with missing values¶
In [25]:
#Check which columns contain null values
nulls = mri.columns[mri.isnull().any()]
In [26]:
#Check how many nulls in each column
mri[nulls].isnull().sum()
Out[26]:
SES     19
MMSE     2
dtype: int64
In [27]:
# Imputation - only works on numerical attributes, hence why we encoded the categorical text features in 3.1
# Impute missing values with the median value of the attribute
imputer = SimpleImputer(strategy="median")

# Final check to make sure all columns are numerical before proceeding with imputation
for column in mri.columns:
    if mri[column].dtype == 'object':
        print(f"{column} of type object found!")
    else:
        print(f"{column} is a numerical attribute - proceed with imputation")

# Fitting imputer instance
imputer.fit(mri)

# Check if the median computed by the imputer matches the median of the original data
print("Medians match:", (imputer.statistics_ == mri.median().values).all())

# Transforming mri data to replace null values via imputation
X = imputer.transform(mri)

# Constructing dataframe from transformed values
mri = pd.DataFrame(X, columns=mri.columns)
Subject ID is a numerical attribute - proceed with imputation
MRI ID is a numerical attribute - proceed with imputation
Group is a numerical attribute - proceed with imputation
Visit is a numerical attribute - proceed with imputation
MR Delay is a numerical attribute - proceed with imputation
M/F is a numerical attribute - proceed with imputation
Hand is a numerical attribute - proceed with imputation
Age is a numerical attribute - proceed with imputation
EDUC is a numerical attribute - proceed with imputation
SES is a numerical attribute - proceed with imputation
MMSE is a numerical attribute - proceed with imputation
CDR is a numerical attribute - proceed with imputation
eTIV is a numerical attribute - proceed with imputation
nWBV is a numerical attribute - proceed with imputation
ASF is a numerical attribute - proceed with imputation
Out[27]:
SimpleImputer(strategy='median')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SimpleImputer(strategy='median')
Medians match: True
In [28]:
#Check that nulls have been replaced
mri[nulls].isnull().sum()
Out[28]:
SES     0
MMSE    0
dtype: int64
3.1.4 Standardizing dataframe attributes¶
In [29]:
cols = ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']

max_values = mri[cols].max()
min_values = mri[cols].min()

print(f"max value in mri dataset for {cols} before scaling:\n{max_values}")
print(f"min value in mri dataset for {cols} before scaling:\n{min_values}")
max value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age       98.000000
M/F        1.000000
EDUC      23.000000
SES        5.000000
MMSE      30.000000
eTIV    2004.479526
nWBV       0.837000
ASF        1.587000
dtype: float64
min value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age       60.000000
M/F        0.000000
EDUC       6.000000
SES        1.000000
MMSE       4.000000
eTIV    1105.652499
nWBV       0.644000
ASF        0.876000
dtype: float64
In [30]:
scaler = StandardScaler().fit(mri)
mri[cols]=scaler.fit_transform(mri[cols])
In [31]:
cols = ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']

max_values = mri[cols].max()
min_values = mri[cols].min()

print(f"max value in mri dataset for {cols} before scaling:\n{max_values}")
print(f"min value in mri dataset for {cols} before scaling:\n{min_values}")
max value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age     2.750282
M/F     1.153798
EDUC    2.925048
SES     2.313556
MMSE    0.721664
eTIV    2.935525
nWBV    2.896887
ASF     2.839157
dtype: float64
min value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age    -2.229597
M/F    -0.866703
EDUC   -2.993181
SES    -1.297140
MMSE   -6.362035
eTIV   -2.174359
nWBV   -2.307345
ASF    -2.316501
dtype: float64

Note: We only standardized certain attributes, as some of these columns will be dropped during feature selection in our next preprocessing step

3.1.5 Dropping features¶
In [32]:
# Drop unneccessary features and also the highly correlated feature CDR from the dataframe
mri = mri.drop(['Subject ID', 'Visit', 'MRI ID', 'Hand', 'MR Delay', 'CDR'], axis=1)
In [33]:
mri.head(10)
Out[33]:
Group M/F Age EDUC SES MMSE eTIV nWBV ASF
0 0.0 1.153798 1.308738 -0.208132 -0.394466 -0.095686 2.833595 -0.905169 -2.265742
1 0.0 1.153798 1.439787 -0.208132 -0.394466 0.721664 2.935525 -1.309643 -2.316501
2 1.0 1.153798 -0.263856 -0.904394 -0.394466 -1.185486 1.081119 0.173429 -1.083784
3 1.0 1.153798 -0.132806 -0.904394 -0.394466 0.176764 1.418413 -0.446765 -1.344830
4 1.0 1.153798 0.391392 -0.904394 -0.394466 -1.457936 1.192666 -0.770344 -1.170800
5 0.0 -0.866703 1.439787 1.184392 0.508208 0.176764 -1.550836 -0.527660 1.802224
6 0.0 -0.866703 1.701886 1.184392 0.508208 -0.095686 -1.637420 -0.311940 1.932747
7 0.0 1.153798 0.391392 -0.904394 1.410882 0.176764 1.139618 -0.473730 -1.134543
8 0.0 1.153798 0.784540 -0.904394 1.410882 0.449214 1.208652 -0.500695 -1.185302
9 0.0 1.153798 1.046639 -0.904394 1.410882 0.721664 1.200386 -0.662484 -1.178051
3.1.6 Splitting data into training / test sets¶
In [34]:
seed = np.random.seed(42)
y =  mri['Group']   #Target 
X = mri.drop(['Group'], axis=1) # Features

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#Create copies of our train/test sets for benchmark testing
BM_X_train = X_train.copy()
BM_X_test = X_test.copy()
BM_y_train = y_train.copy()
BM_y_test = y_test.copy()

#Create further copies incase of further optimization techniques
Copy_X_train = X_train.copy()
Copy_X_test = X_test.copy()
Copy_y_train = y_train.copy()
Copy_y_test = y_test.copy()
In [35]:
print("X_train set is made up of {} rows and {} columns\n".format(len(X_train.index), len(X_train.columns)))
print("X_test set is made up of {} rows and {} columns\n".format(len(X_test.index), len(X_test.columns)))
print("y_train is made up of {} values\n".format(len(y_train.index)))
print("y_test is made up of {} values\n".format(len(y_test.index)))
X_train set is made up of 261 rows and 8 columns

X_test set is made up of 112 rows and 8 columns

y_train is made up of 261 values

y_test is made up of 112 values

3.2 Data Implementation¶

Benchmark Statistics

In [36]:
## As per the evaluators suggestion, look into using xgboost  -> popular boosting model -> appropiate here
#setting random state as 42 for reproducability

# Define the models
models = [ 
    LogisticRegression(random_state=42),
    SVC(random_state=42),
    DecisionTreeClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    GaussianNB()
]

# Define the corresponding classifier names
classifiers = [
    'Logistic Regression',
    'SVC',
    'Decision Tree Classifier',
    'AdaBoost Classifier',
    'Random Forest Classifier',
    'XGBClassifier',
    'GradientBoostingClassifier',
    'Naive Bayes'
]

# Define the columns for the benchmark dataframe
ml_columns = ['ML Algo name', 'Model Parameters', 'training AUC accuracy', 'test recall score', 'test AUC score']

# Create an empty dataframe to store benchmark results
benchmark = pd.DataFrame(columns=ml_columns)

# Perform cross-validation and evaluation for each model
for model, clf_name in zip(models, classifiers):
    benchmark.loc[len(benchmark)] = [clf_name, str(model.get_params()), np.nan, np.nan, np.nan]
    cv_score = cross_val_score(model, BM_X_train, BM_y_train, cv=5, scoring='roc_auc') 
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'training AUC accuracy'] = np.mean(cv_score)
    mdl = model.fit(BM_X_train, BM_y_train)
    test_acc = mdl.score(BM_X_test, BM_y_test)
    fpr, tpr, thresholds = roc_curve(BM_y_test, model.predict(BM_X_test))
    test_recall = recall_score(BM_y_test, model.predict(BM_X_test))
    test_auc = auc(fpr, tpr)
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'test recall score'] = test_recall
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'test AUC score'] = test_auc 

# Sort benchmark table by test AUC score
benchmark.sort_values(by='test AUC score', ascending=False, inplace=True)

# Plot the benchmark results
sns.barplot(x='test AUC score', y='ML Algo name', data=benchmark)

# Set the width of the model parameters column
benchmark.style.set_properties(subset=['Model Parameters'], **{'width': '300px'})
Out[36]:
<Axes: xlabel='test AUC score', ylabel='ML Algo name'>
Out[36]:
  ML Algo name Model Parameters training AUC accuracy test recall score test AUC score
4 Random Forest Classifier {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False} 0.917676 0.833333 0.868590
6 GradientBoostingClassifier {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} 0.895164 0.816667 0.821795
5 XGBClassifier {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None} 0.898903 0.766667 0.806410
1 SVC {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False} 0.887858 0.650000 0.776923
3 AdaBoost Classifier {'algorithm': 'SAMME.R', 'estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 'random_state': 42} 0.870394 0.783333 0.766667
2 Decision Tree Classifier {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': 42, 'splitter': 'best'} 0.771987 0.766667 0.748718
7 Naive Bayes {'priors': None, 'var_smoothing': 1e-09} 0.879456 0.650000 0.748077
0 Logistic Regression {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False} 0.907505 0.700000 0.744231
No description has been provided for this image

Plotting the roc_curves and confusion metrics for evaluation purposes ...

In [37]:
total_fpr = {}
total_tpr = {}

def roc_curves(model, classifier):
    ML_name = model.__class__.__name__
    fpr, tpr, thresholds = roc_curve(BM_y_test, model.predict(BM_X_test))
    test_auc = auc(fpr, tpr)
    total_fpr[ML_name] = fpr
    total_tpr[ML_name] = tpr
    plt.plot(fpr, tpr, color='darkorange', linewidth=2, label='(Area = %0.2f)' % test_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('ROC Curve for {}'.format(classifier))
    plt.show()

def confusion_matrix_plot(model, classifier):
    cm = confusion_matrix(y_test, model.predict(BM_X_test))
    plt.imshow(cm, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    labels = ['Nondemented', 'Demented']
    plt.title('Confusion Matrix for {}'.format(classifier))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels)
    plt.yticks(tick_marks, labels)
    s = [['TN', 'FP'], ['FN', 'TP']]
    for i, j in itertools.product(range(2), range(2)):
        plt.text(j, i, str(s[i][j]) + " = " + str(cm[i][j]))
    plt.show()

for model, classifier in zip(models, classifiers):
    roc_curves(model, classifier)
    confusion_matrix_plot(model, classifier)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Plotting all of the ROC curves on one graph ...

In [38]:
colors = {
    'LogisticRegression': 'red',
    'SVC': 'green',
    'DecisionTreeClassifier': 'blue',
    'AdaBoostClassifier': 'yellow',
    'RandomForestClassifier': 'cyan',
    'XGBClassifier': 'magenta',
    'GradientBoostingClassifier': 'black',
    'GaussianNB': 'white'
}

plt.figure(figsize=(20, 12))
for model_name in total_fpr.keys():
    plt.plot(total_fpr[model_name], total_tpr[model_name], color=colors[model_name], lw=1, label=model_name)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title('Comparison of ROC curves for all classifiers')
plt.plot([0, 1], [0, 1], color='darkorange', lw=2, linestyle='--')
plt.legend()
plt.show();
No description has been provided for this image
3.2.2 Implementation¶

Optimization techniques

I made the mistake of trying to perform GridSearchCV for all classifiers. As you can see in the markdown below, the hyperparameter tuning of the Random Forests Classifier and the decision tree classifier took approximately 135746 & 37069 seconds or in more explicit scaling, upwards of 38 / 10 hours.

Thus, I will only be performing GridSearchCV for LogisticRegression and SVM.

The remaining classifiers will be tuned using RandomizedSearchCV and hyperopt

GridSearch/RandomizedSearchCV¶

The below function 'grid' uses GridSearchCV to find optimal hyperparamters for the Logistic Regression and SVM Models. Due to the vast parameter grids for the remaining Models, it was advisable to tune their hyperparameters via RandomizedSearchCV.

Also, the 3 benchmark models (SVM, RandomForestClassifier,XGBoost) which i wanted to pay particular attention to,are also fitted for the evaluation metric 'accuracy'. Reasoning behind this - I wanted to collect the classification error on training/test sets ... the classification error being equal to (1 - accuracy_score)... to test the robustness of each model.

In [39]:
#Create empty final results dataframe for optimization metrics
refinement_cols = ['ML Model','Method','fitting time', 'optimal hyperparameters', 'best training AUC score', 'best estimator','test recall score', 'test AUC score']
finale = pd.DataFrame(columns = refinement_cols)

#Create empty robustness table to compare training and testing errors
robust_cols = ['ML Model', 'Training Classification Error', 'Test Classification_Error']
robustment_test = pd.DataFrame(columns = robust_cols)

#Storing training times
training_time = []
#Create two empty dictionaries so that the FPR and TPR can be captured for each model. This will allow us to plot all the ROC curves on one graph
total_fpr_opt = {} 
total_tpr_opt = {}



#Define classifiers for tuning - set random_state for reproducible results
classifiers = [ LogisticRegression(random_state=42),
                SVC(random_state=42),
                AdaBoostClassifier(random_state=42),
                DecisionTreeClassifier(random_state=42),
                RandomForestClassifier(random_state=42),
                GradientBoostingClassifier(random_state=42),
                XGBClassifier(random_state=42) ]



lr_parameters = { 
                  'C':[0.001, 0.01, 0.1, 1, 10,100,1000, 10000], 
                  'max_iter':list(range(2,100))}


svm_parameters = {
                  'kernel': ['rbf', 'poly', 'sigmoid'],
                  'C':[0.00001,0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 30, 40, 50 ,60 , 80, 100],
                  'gamma': [0.00001,0.0001, 0.001, 0.01, 0.1, 1, 10] }

dt_parameters = { #'max_leaf_nodes': list(range(2,100)),
                  'max_leaf_nodes': list(range(2,20)),
                  'splitter':['random', 'best'],
                  'criterion': ['gini', 'entropy'],
                  'max_depth': list(range(2,20)),
                  'min_samples_split': list(range(2,20))}

rf_parameters = { 
                  'n_estimators': [10,30,50,100,150,200,250,300,350,400,500,600,700,800,900,1000],
                  'max_leaf_nodes': list(range(2,30)),
                  'min_samples_leaf':[1,2,3], 'min_samples_split':[3,4,5,6,7]}

gb_parameters = {
                  'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                  'min_samples_leaf': list(range(1,20)),
                  'max_depth': list(range(2,20)),
                  'n_estimators': range(1,200)}

xgboost_parameters = { 
                  'silent': [True],
                  'max_depth': [6, 10, 15, 20],
                  'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
                  'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                  'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                  'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                  'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
                  'gamma': [0, 0.25, 0.5, 1.0],
                  'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
                  'n_estimators': [50,100,120,150]}


ada_parameters = { 
                   'n_estimators': [50,100,200,250,300,400,500,600,700,800,900,1000],
                   'learning_rate': [ 0.1,0.15, 0.2,0.25, 0.3,0.4,0.5,0.6,0.7,0.8,0.9,1] }

parameters = [lr_parameters, 
              svm_parameters,
              ada_parameters, 
              dt_parameters,
              rf_parameters, 
              gb_parameters, 
              xgboost_parameters]  
                    
In [40]:
os.environ["PARALLEL_JOBS"] = "16"
In [41]:
def get_n_jobs():
    parallel_jobs = os.getenv('PARALLEL_JOBS')
    if parallel_jobs is not None:
        try:
            n_jobs = int(parallel_jobs)
            return n_jobs
        except ValueError:
            print("Error: PARALLEL_JOBS is not an integer. Using default value (None).")
            return None
    else:
        return None

parallel_jobs = get_n_jobs()

def print_summary(ML_name, mdl, method, fitting_time, best_params, confusion_matrix, complete_time,parallel_jobs,feature_importances=False):
    print("--" * 50)
    print("Finding the best hyperparameters for metric [roc_auc] for {}\n".format(ML_name))
    print("Performing {} for {}\n".format(method, ML_name))
    print("Beginning to fit {}\n".format(ML_name))
    print("Optimal hyperparameters:", best_params)
    print("Confusion matrix:")
    confusion_matrix_plot(confusion_matrix, ML_name)
    if feature_importances:
        print_feature_importances(ML_name, mdl)
    if parallel_jobs == None:
        parallel_jobs = 1
    print(f"Hyperparameter tuning complete. Used {parallel_jobs} parallel jobs... Took {complete_time:.2f} seconds.\n")

def print_feature_importances(ML_name, mdl):
    print("=="*16,"feature_importances","=="*16)
    for name, importance in zip(X_train.columns, mdl.feature_importances_):
        print(name, "==", importance)
    importances = mdl.feature_importances_
    indices = np.argsort(importances)
    features = X_train.columns
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importances')
    plt.show();

def confusion_matrix_plot(confusion_matrix, classifier):
    plt.imshow(confusion_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    labels = ['Nondemented', 'Demented']
    plt.title('Confusion Matrix for {}'.format(classifier))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels)
    plt.yticks(tick_marks, labels)
    s = [['TN', 'FP'], ['FN', 'TP']]
    for i, j in itertools.product(range(2), range(2)):
        plt.text(j, i, str(s[i][j]) + " = " + str(confusion_matrix[i][j]))
    plt.show()

def grid(classifier, parameters,jobs):
    ML_name = classifier.__class__.__name__
    start_time = time()
    models = ['DecisionTreeClassifier', 'RandomForestClassifier', 'XGBClassifier', 'GradientBoostingClassifier', 'AdaBoostClassifier']
    if ML_name in models:
        method = "Randomized Grid Search"
        grid = RandomizedSearchCV(estimator=classifier, param_distributions=parameters, cv=10, n_iter=100, scoring="roc_auc", random_state=42,n_jobs=jobs,verbose=0)
    else:
        method = "Grid Search"
        grid = GridSearchCV(estimator=classifier, param_grid=parameters, cv=10, scoring="roc_auc",n_jobs=jobs,verbose=0)
    grid.fit(X_train, y_train)
    fitting_time = time() - start_time
    best_params = grid.best_params_
    mdl = grid.best_estimator_.fit(X_train, y_train)
    pred = mdl.predict(X_test)
    test_acc = accuracy_score(y_test, pred)
    test_recall = recall_score(y_test, pred, pos_label=1)
    fpr,tpr,thresholds = roc_curve(y_test, pred, pos_label=1)
    test_auc = auc(fpr, tpr)
    total_fpr_opt[ML_name] = fpr  #Dit key
    total_tpr_opt[ML_name] = tpr #Dict key
    conf_matrix = confusion_matrix(y_test, pred)
    complete_time = time() - start_time
    # Feature importances if applicable
    feature_importances = None
    if ML_name in models:
        feature_importances = True

    # Inserting data into finale DataFrame
    finale.loc[index, 'ML Model'] = ML_name
    finale.loc[index, 'Method'] = method
    finale.loc[index, 'fitting time'] = fitting_time
    finale.loc[index, 'optimal hyperparameters'] = str(best_params)
    finale.loc[index, 'best training AUC score'] = grid.best_score_
    finale.loc[index, 'best estimator'] = str(mdl)
    # Inserting test recall score and test AUC score into finale DataFrame
    finale.loc[index, 'test recall score'] = test_recall
    finale.loc[index, 'test AUC score'] = test_auc
    print_summary(ML_name, mdl, method, fitting_time, best_params, conf_matrix, complete_time,jobs,feature_importances=feature_importances)
    # Robustness check for certain models
    robust_models = ['SVC', 'RandomForestClassifier', 'XGBClassifier']
    if ML_name in robust_models:
        print("------------- Calculating {} classification error for Robustness checks  -------------\n".format(ML_name))
        if ML_name in models:
            grid = RandomizedSearchCV(estimator=classifier, param_distributions=parameters, cv=10, n_iter=100, scoring="accuracy")
            robustment_test.loc[index, 'ML Model'] = ML_name
            grid.fit(X_train, y_train)
        else:
            grid = GridSearchCV(estimator=classifier, param_grid=parameters, cv=10, scoring="accuracy")
            robustment_test.loc[index, 'ML Model'] = ML_name
            grid.fit(X_train, y_train)
        training_error = 1 - (grid.best_score_)
        robustment_test.loc[index, 'Training Classification Error'] = training_error
        mdl = grid.best_estimator_.fit(X_train, y_train)
        pred = mdl.predict(X_test)
        test_error = 1 - (accuracy_score(y_test, pred))
        robustment_test.loc[index, 'Test Classification_Error'] = test_error
        print("Training classification error: ", training_error)
        print("Testing classification error: ", test_error)
        print("\n Robustness checks complete!")


index = 0
for classifier, parameter in zip(classifiers, parameters):
    grid(classifier, parameter,parallel_jobs)
    index += 1
----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for LogisticRegression

Performing Grid Search for LogisticRegression

Beginning to fit LogisticRegression

Optimal hyperparameters: {'C': 100, 'max_iter': 9}
Confusion matrix:
No description has been provided for this image
Hyperparameter tuning complete. Used 16 parallel jobs... Took 6.87 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for SVC

Performing Grid Search for SVC

Beginning to fit SVC

Optimal hyperparameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Confusion matrix:
No description has been provided for this image
Hyperparameter tuning complete. Used 16 parallel jobs... Took 2.66 seconds.

------------- Calculating SVC classification error for Robustness checks  -------------

Training classification error:  0.09943019943019937
Testing classification error:  0.1160714285714286

 Robustness checks complete!
----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for AdaBoostClassifier

Performing Randomized Grid Search for AdaBoostClassifier

Beginning to fit AdaBoostClassifier

Optimal hyperparameters: {'n_estimators': 400, 'learning_rate': 0.8}
Confusion matrix:
No description has been provided for this image
================================ feature_importances ================================
M/F == 0.045
Age == 0.1325
EDUC == 0.0625
SES == 0.0625
MMSE == 0.0375
eTIV == 0.22
nWBV == 0.2975
ASF == 0.1425
No description has been provided for this image
Hyperparameter tuning complete. Used 16 parallel jobs... Took 66.95 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for DecisionTreeClassifier

Performing Randomized Grid Search for DecisionTreeClassifier

Beginning to fit DecisionTreeClassifier

Optimal hyperparameters: {'splitter': 'best', 'min_samples_split': 6, 'max_leaf_nodes': 7, 'max_depth': 2, 'criterion': 'gini'}
Confusion matrix:
No description has been provided for this image
================================ feature_importances ================================
M/F == 0.0998291718520719
Age == 0.0
EDUC == 0.0
SES == 0.0
MMSE == 0.9001708281479281
eTIV == 0.0
nWBV == 0.0
ASF == 0.0
No description has been provided for this image
Hyperparameter tuning complete. Used 16 parallel jobs... Took 0.68 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for RandomForestClassifier

Performing Randomized Grid Search for RandomForestClassifier

Beginning to fit RandomForestClassifier

Optimal hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_leaf_nodes': 28}
Confusion matrix:
No description has been provided for this image
================================ feature_importances ================================
M/F == 0.06072693782658392
Age == 0.08072949191275151
EDUC == 0.08453418287763775
SES == 0.04067643562190814
MMSE == 0.3412779675591842
eTIV == 0.11370031119223188
nWBV == 0.16491149087601517
ASF == 0.11344318213368734
No description has been provided for this image
Hyperparameter tuning complete. Used 16 parallel jobs... Took 45.26 seconds.

------------- Calculating RandomForestClassifier classification error for Robustness checks  -------------

Training classification error:  0.1495726495726496
Testing classification error:  0.1964285714285714

 Robustness checks complete!
----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for GradientBoostingClassifier

Performing Randomized Grid Search for GradientBoostingClassifier

Beginning to fit GradientBoostingClassifier

Optimal hyperparameters: {'n_estimators': 84, 'min_samples_leaf': 8, 'max_depth': 15, 'learning_rate': 0.8}
Confusion matrix:
No description has been provided for this image
================================ feature_importances ================================
M/F == 0.06288633186170128
Age == 0.06195875992764563
EDUC == 0.06696296486554618
SES == 0.04390265085288797
MMSE == 0.4892290215022818
eTIV == 0.09386109718420312
nWBV == 0.13803520750280088
ASF == 0.04316396630293312
No description has been provided for this image
Hyperparameter tuning complete. Used 16 parallel jobs... Took 12.89 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for XGBClassifier

Performing Randomized Grid Search for XGBClassifier

Beginning to fit XGBClassifier

Optimal hyperparameters: {'subsample': 0.7, 'silent': True, 'reg_lambda': 0.1, 'n_estimators': 100, 'min_child_weight': 0.5, 'max_depth': 20, 'learning_rate': 0.2, 'gamma': 0.5, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8}
Confusion matrix:
No description has been provided for this image
================================ feature_importances ================================
M/F == 0.16292475
Age == 0.0908916
EDUC == 0.09762108
SES == 0.09007445
MMSE == 0.27741197
eTIV == 0.09610281
nWBV == 0.10608934
ASF == 0.07888401
No description has been provided for this image
Hyperparameter tuning complete. Used 16 parallel jobs... Took 2.41 seconds.

------------- Calculating XGBClassifier classification error for Robustness checks  -------------

Training classification error:  0.1760683760683761
Testing classification error:  0.2053571428571429

 Robustness checks complete!
In [42]:
finale.head(10)
finale[['ML Model','test AUC score']]
Out[42]:
ML Model Method fitting time optimal hyperparameters best training AUC score best estimator test recall score test AUC score
0 LogisticRegression Grid Search 6.858933 {'C': 100, 'max_iter': 9} 0.904043 LogisticRegression(C=100, max_iter=9, random_s... 0.7 0.744231
1 SVC Grid Search 2.657477 {'C': 10, 'gamma': 1, 'kernel': 'rbf'} 0.970661 SVC(C=10, gamma=1, random_state=42) 0.866667 0.885256
2 AdaBoostClassifier Randomized Grid Search 66.5541 {'n_estimators': 400, 'learning_rate': 0.8} 0.888874 AdaBoostClassifier(learning_rate=0.8, n_estima... 0.8 0.794231
3 DecisionTreeClassifier Randomized Grid Search 0.671705 {'splitter': 'best', 'min_samples_split': 6, '... 0.85049 DecisionTreeClassifier(max_depth=2, max_leaf_n... 0.6 0.751923
4 RandomForestClassifier Randomized Grid Search 45.050044 {'n_estimators': 300, 'min_samples_split': 5, ... 0.906121 RandomForestClassifier(max_leaf_nodes=28, min_... 0.766667 0.787179
5 GradientBoostingClassifier Randomized Grid Search 12.803396 {'n_estimators': 84, 'min_samples_leaf': 8, 'm... 0.931262 GradientBoostingClassifier(learning_rate=0.8, ... 0.783333 0.805128
6 XGBClassifier Randomized Grid Search 2.382946 {'subsample': 0.7, 'silent': True, 'reg_lambda... 0.913014 XGBClassifier(base_score=None, booster=None, c... 0.816667 0.841026
Out[42]:
ML Model test AUC score
0 LogisticRegression 0.744231
1 SVC 0.885256
2 AdaBoostClassifier 0.794231
3 DecisionTreeClassifier 0.751923
4 RandomForestClassifier 0.787179
5 GradientBoostingClassifier 0.805128
6 XGBClassifier 0.841026
In [43]:
#Check classification scores for the 3 main benchmark models
robustment_test
Out[43]:
ML Model Training Classification Error Test Classification_Error
1 SVC 0.09943 0.116071
4 RandomForestClassifier 0.149573 0.196429
6 XGBClassifier 0.176068 0.205357

Plotting the ROC curve for each classifier ...

In [44]:
##Setting the colour of the ROC curves for each model
colors = { 'LogisticRegression':'red',
            'SVC':'green',
            'DecisionTreeClassifier':'blue',
            'AdaBoostClassifier':'yellow',
            'RandomForestClassifier':'cyan',
            'XGBClassifier':'magenta', 
            'GradientBoostingClassifier': 'black'
        }    

#total_fpr.keys() returns the models name as the key
plt.figure(figsize=(20,12))
for i in total_fpr_opt.keys():
    colors = {'LogisticRegression':'red', 'SVC':'green', 'DecisionTreeClassifier':'blue', 'AdaBoostClassifier':'yellow','RandomForestClassifier':'cyan', 'XGBClassifier':'magenta', 'GradientBoostingClassifier': 'black', 'GaussianNB': 'white'}    
    plt.plot(total_fpr_opt[i],total_tpr_opt[i],colors[i], lw=1, label=i)
plt.xlim([0,1])
plt.ylim([0,1])
plt.title('Comparison of ROC curves for all classifiers')
plt.plot([0, 1], [0, 1], color='darkorange', lw=2, linestyle='--')
plt.legend();
No description has been provided for this image

Further Hyperparameter refinement using hyperopt¶

In [45]:
def Performance(model, y, X, fitting_time, best_params):
    predictions = model.predict(X)
    recall = recall_score(y, predictions, pos_label=1)
    fpr, tpr, _ = roc_curve(y, predictions)
    auc_score = auc(fpr, tpr)
    print('Recall score: {:.4f}'.format(recall))
    print('AUC score: {:.4f}'.format(auc_score))
    
    # Update finale table
    global index
    finale.loc[index, 'ML Model'] = model.__class__.__name__
    finale.loc[index, 'Method'] = 'Hyperopt tuning'
    finale.loc[index, 'fitting time'] = str(fitting_time)
    finale.loc[index, 'optimal hyperparameters'] = str(best_params)
    finale.loc[index, 'best training AUC score'] = auc_score
    finale.loc[index, 'best estimator'] = str(model)
    finale.loc[index, 'test recall score'] = recall
    finale.loc[index, 'test AUC score'] = auc_score
    
    # Increment index for the next entry
    index += 1

XGBoost hyperopt


In [46]:
def objective(params):
    params = {
        'n_estimators': params['n_estimators'],
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'min_child_weight': "{:.3f}".format(params['min_child_weight']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'random_state': params['random_state']
    }
    
    #clf = XGBClassifier(n_jobs=4,**params)
    clf = XGBClassifier(**params)
    skf = StratifiedKFold(n_splits=10, shuffle=True,random_state=42)
    score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
    print("roc_auc {:.3f} params {}".format(score, params))
    training_scores.append(score)
    return score

space = {
    'n_estimators': scope.int(hp.uniform('n_estimators',100, 2000)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.4),
    'max_depth': scope.int(hp.quniform('max_depth', 2, 14, 1)),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'min_child_weight': hp.uniform('min_child_weight',0.1, 0.6),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
    'random_state': 42 
}

#Capture training scores & fitting times as well
training_scores = []
start_time = time()
best = fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {} ". format(sum(training_scores)/len(training_scores)))
roc_auc 0.887 params {'n_estimators': 1546, 'learning_rate': '0.129', 'max_depth': 4, 'gamma': '0.469', 'min_child_weight': '0.462', 'colsample_bytree': '0.954', 'random_state': 42}
roc_auc 0.886 params {'n_estimators': 910, 'learning_rate': '0.064', 'max_depth': 6, 'gamma': '0.374', 'min_child_weight': '0.452', 'colsample_bytree': '0.952', 'random_state': 42}
roc_auc 0.908 params {'n_estimators': 1777, 'learning_rate': '0.110', 'max_depth': 6, 'gamma': '0.267', 'min_child_weight': '0.388', 'colsample_bytree': '0.651', 'random_state': 42}
roc_auc 0.900 params {'n_estimators': 896, 'learning_rate': '0.072', 'max_depth': 12, 'gamma': '0.195', 'min_child_weight': '0.135', 'colsample_bytree': '0.822', 'random_state': 42}
roc_auc 0.889 params {'n_estimators': 563, 'learning_rate': '0.189', 'max_depth': 8, 'gamma': '0.300', 'min_child_weight': '0.149', 'colsample_bytree': '0.910', 'random_state': 42}
roc_auc 0.891 params {'n_estimators': 469, 'learning_rate': '0.189', 'max_depth': 12, 'gamma': '0.146', 'min_child_weight': '0.587', 'colsample_bytree': '0.920', 'random_state': 42}
roc_auc 0.890 params {'n_estimators': 1219, 'learning_rate': '0.299', 'max_depth': 12, 'gamma': '0.192', 'min_child_weight': '0.482', 'colsample_bytree': '0.971', 'random_state': 42}
roc_auc 0.907 params {'n_estimators': 825, 'learning_rate': '0.336', 'max_depth': 5, 'gamma': '0.179', 'min_child_weight': '0.123', 'colsample_bytree': '0.721', 'random_state': 42}
roc_auc 0.887 params {'n_estimators': 1065, 'learning_rate': '0.219', 'max_depth': 9, 'gamma': '0.346', 'min_child_weight': '0.462', 'colsample_bytree': '0.961', 'random_state': 42}
roc_auc 0.900 params {'n_estimators': 699, 'learning_rate': '0.220', 'max_depth': 3, 'gamma': '0.159', 'min_child_weight': '0.392', 'colsample_bytree': '0.864', 'random_state': 42}
100%|██████████| 10/10 [00:04<00:00,  2.40trial/s, best loss: 0.8856679894179894]

Hyperopt completed in 4.178892374038696 seconds

The average training score was 0.894657671957672 

Using the best parameters computed using Hypteropt to calculate a Recall and AUC score:

In [47]:
# Call Performance function after hyperparameter tuning
mdl = XGBClassifier(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)
Out[47]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9516347225548255, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.37420938791338665,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06443939093069395,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=0.4520573760002473, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=910,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9516347225548255, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.37420938791338665,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06443939093069395,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=0.4520573760002473, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=910,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
Recall score: 0.7833
AUC score: 0.8051

RandomForest hyperopt


In [48]:
def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'n_estimators':int(params['n_estimators']),
        'random_state': params['random_state']
    }
    
    #clf = RandomForestClassifier(n_jobs=4,**params)
    clf = RandomForestClassifier(**params)
    score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
    print("roc_auc {:.3f} params {}".format(score, params))
    training_scores.append(score)
    return score

space = {
    'max_depth': scope.int(hp.quniform('max_depth', 10,30, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 100,1500,25)),
    'random_state': 42 }

#Capture training scores & fitting times as well
training_scores = []
start_time = time()
best = fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {} ". format(sum(training_scores)/len(training_scores)))
roc_auc 0.923 params {'max_depth': 26, 'n_estimators': 625, 'random_state': 42}
roc_auc 0.923 params {'max_depth': 18, 'n_estimators': 450, 'random_state': 42} 
roc_auc 0.924 params {'max_depth': 13, 'n_estimators': 725, 'random_state': 42} 
roc_auc 0.923 params {'max_depth': 13, 'n_estimators': 225, 'random_state': 42} 
roc_auc 0.924 params {'max_depth': 27, 'n_estimators': 425, 'random_state': 42} 
roc_auc 0.918 params {'max_depth': 13, 'n_estimators': 150, 'random_state': 42} 
roc_auc 0.917 params {'max_depth': 25, 'n_estimators': 125, 'random_state': 42} 
roc_auc 0.922 params {'max_depth': 18, 'n_estimators': 575, 'random_state': 42} 
roc_auc 0.922 params {'max_depth': 12, 'n_estimators': 1450, 'random_state': 42}
roc_auc 0.922 params {'max_depth': 28, 'n_estimators': 1200, 'random_state': 42}
100%|██████████| 10/10 [00:20<00:00,  2.09s/trial, best loss: 0.9165542328042328]

Hyperopt completed in 20.87399196624756 seconds

The average training score was 0.9216392857142857 
In [49]:
# Call Performance function after hyperparameter tuning
mdl = RandomForestClassifier(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)
Out[49]:
RandomForestClassifier(max_depth=25, n_estimators=125, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=25, n_estimators=125, random_state=42)
Recall score: 0.8333
AUC score: 0.8590

SVC hyperopt


In [50]:
def objective(params):
    params = {
        'C': params['C'],
        'gamma':params['gamma'],
        'kernel': params['kernel'],
        'random_state': params['random_state']
    }
    
    clf = SVC(**params)
    score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
    print("roc_auc {:.3f} params {}".format(score, params))
    training_scores.append(score)
    return score

space = {
    'C': hp.uniform('C', 10,11),
    'gamma': hp.quniform('gamma', 0.35,0.42,0.01),    
    'kernel': 'rbf', 
    'random_state': 42 }

#Capture training scores & fitting times as well
training_scores = []
start_time = time()
best = fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {} ". format(sum(training_scores)/len(training_scores)))
roc_auc 0.931 params {'C': 10.864562445314965, 'gamma': 0.39, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.933 params {'C': 10.050081000530929, 'gamma': 0.4, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.933 params {'C': 10.59883981122294, 'gamma': 0.4, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.929 params {'C': 10.8091990797788, 'gamma': 0.37, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.928 params {'C': 10.01700859922015, 'gamma': 0.37, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.937 params {'C': 10.970767038409553, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.936 params {'C': 10.741369260572458, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.935 params {'C': 10.08465967125343, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.927 params {'C': 10.616443464899518, 'gamma': 0.36, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.937 params {'C': 10.849924492903604, 'gamma': 0.42, 'kernel': 'rbf', 'random_state': 42}
100%|██████████| 10/10 [00:00<00:00, 37.84trial/s, best loss: 0.9266600529100529]

Hyperopt completed in 0.2672393321990967 seconds

The average training score was 0.9324859788359788 
In [51]:
# Call Performance function after hyperparameter tuning
mdl = SVC(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)
Out[51]:
SVC(C=10.616443464899518, gamma=0.36, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(C=10.616443464899518, gamma=0.36, random_state=42)
Recall score: 0.8667
AUC score: 0.9045

Section 4: Results¶


4.1 Model Evaluation and Validation¶

Going to compare the 2 dataframes -- finale & benchmark -- to show the differences due to parameter tuning ...

In [52]:
def process_results(finale, benchmark):
    # Rename column in benchmark DataFrame
    benchmark.rename(columns={'ML Algo name': 'ML Model'}, inplace=True)
    
    # Extract relevant columns from finale DataFrame and sort by ML Model
    output = finale[['ML Model', 'Method', 'test AUC score']].sort_values('ML Model')
    
    # Set Method column to 'untuned' in benchmark DataFrame
    benchmark['Method'] = 'untuned'
    
    # Extract relevant columns from benchmark DataFrame
    benchmark = benchmark[['ML Model', 'Method', 'test AUC score']]
    
    # Concatenate output and benchmark DataFrames, sort by ML Model, and set index
    df = pd.concat([output, benchmark]).sort_values('ML Model')
    df.set_index('ML Model', inplace=True)
    
    return df
In [53]:
output = process_results(finale,benchmark)
output
Out[53]:
Method test AUC score
ML Model
AdaBoost Classifier untuned 0.766667
AdaBoostClassifier Randomized Grid Search 0.794231
Decision Tree Classifier untuned 0.748718
DecisionTreeClassifier Randomized Grid Search 0.751923
GradientBoostingClassifier Randomized Grid Search 0.805128
GradientBoostingClassifier untuned 0.821795
Logistic Regression untuned 0.744231
LogisticRegression Grid Search 0.744231
Naive Bayes untuned 0.748077
Random Forest Classifier untuned 0.86859
RandomForestClassifier Hyperopt tuning 0.858974
RandomForestClassifier Randomized Grid Search 0.787179
SVC Hyperopt tuning 0.904487
SVC untuned 0.776923
SVC Grid Search 0.885256
XGBClassifier Hyperopt tuning 0.805128
XGBClassifier untuned 0.80641
XGBClassifier Randomized Grid Search 0.841026
In [54]:
# Extract the 'untuned' test AUC score for each ML Model and store it in a dictionary
untuned_dict = {}
for model, group in output[output['Method'] == 'untuned'].groupby('ML Model'):
    untuned_dict[model] = group.iloc[0]['test AUC score']

# List of methods to check against
methods_to_check = ['Randomized Grid Search', 'Hyperopt tuning', 'Grid Search']

# Print out the test AUC scores for each ML model with Randomized Grid Search method that outperformed the untuned method
for model, group in output.groupby('ML Model'):
    if any(method in group['Method'].values for method in methods_to_check):
        for method in methods_to_check:
            if method in group['Method'].values:
                method_score = group[group['Method'] == method]['test AUC score'].values[0]
                if model in untuned_dict and method_score > untuned_dict[model]:
                    print(f"The following model and optimisation method outperformed its benchmark value of {untuned_dict[model]}")
                    print(f"ML Model: {model}")
                    print(f"Method: {method}")
                    print(f"Test AUC score: {method_score}")
                    print()
The following model and optimisation method outperformed its benchmark value of 0.7769230769230769
ML Model: SVC
Method: Hyperopt tuning
Test AUC score: 0.9044871794871795

The following model and optimisation method outperformed its benchmark value of 0.7769230769230769
ML Model: SVC
Method: Grid Search
Test AUC score: 0.8852564102564102

The following model and optimisation method outperformed its benchmark value of 0.8064102564102564
ML Model: XGBClassifier
Method: Randomized Grid Search
Test AUC score: 0.841025641025641

Section 5: Conclusion¶


5.1 Free-form visualization of SVM Decision Boundaries¶
In [55]:
#Define a meshgrid function that will plot the decision surface
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

#Select 2 features randomly to compare - SES / EDUC in this case
#Convert to array
X = np.array(X_test)
X = X[:, [2,3]]

#ax1 --> benchmark model
#ax2 --> optimised model
fig, (ax1, ax2) = plt.subplots(1,2, sharey=True, figsize=(20,10))
title = ('Decision surface for Benchmark model')
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
mdl = SVC(C=1, gamma='auto', kernel='rbf', random_state=42).fit(X, y_test)
plot_contours(ax1, mdl, xx, yy, cmap=plt.cm.coolwarm, alpha=0.7)
ax1.scatter(X0, X1, c=y_test, cmap=plt.cm.coolwarm, s=20, marker='x', edgecolors='k')
ax1.set_xlabel(X_test.columns[2])
ax1.set_ylabel(X_test.columns[3])
ax1.set_xticks(())
ax1.set_yticks(())
ax1.legend()
ax1.set_title(title);


title = ('Decision surface for Optimized model')
mdl = SVC(C=10.94, gamma=0.36, kernel='rbf', random_state=42).fit(X, y_test)
plot_contours(ax2, mdl, xx, yy, cmap=plt.cm.coolwarm, alpha=0.7)
ax2.scatter(X0, X1, c=y_test, cmap=plt.cm.coolwarm, s=25, marker='x', edgecolors='k')
ax2.set_xlabel(X_test.columns[2])
ax2.set_ylabel(X_test.columns[3])
ax2.set_xticks(())
ax2.set_yticks(())
ax2.set_title(title);
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No description has been provided for this image